This notebook reports the characterisation of molecular pathways associated with the signature idenitfied from cluster 2 of cancer genomes. See /Volumes/cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer_v2_1.Rmd.
Define colors for cancer primary sites
# Load information about analysed cancer primary sites
cancer.project.info.df <- read.csv("./Dataset/ICGC/projects_2023_10_30_04_53_06.tsv", sep = "\t") %>% separate(Project.Code, c("cancer.type", "project"), sep = "-") %>% dplyr::select(cancer.type, Primary.Site)
# Add considered donor information
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
snvs.dens.ratio.df <- snvs.dens.ratio.df %>% left_join(cancer.project.info.df, by = "cancer.type")
saveRDS(snvs.dens.ratio.df, "./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
donors.color.df <- snvs.dens.ratio.df %>% dplyr::select(donor, cancer.type) %>% left_join(cancer.project.info.df, by = "cancer.type")
# Define color palette
length(unique(donors.color.df$Primary.Site)) # 14 colours needed
primary.site.color.palette <- c("#662483", "#29235C", "#1D71B8", "#006633", "#B17F4A", "#DEDC00", "#BE1622",
"#2FAC66", "#F9B233", "#EA83B3", "#9D9D9C", "#E94E1B", "#683C11", "#A084BC")
primary.site.color.palette.df <- cbind.data.frame(Primary.Site = unique(donors.color.df$Primary.Site), color = primary.site.color.palette)
donors.color.df <- donors.color.df %>% left_join(primary.site.color.palette.df, by = "Primary.Site")
saveRDS(donors.color.df, "./003_Transcriptomics_cluster_analysis/rds/donors.color.df.rds")
For the rest on the analysis, we consider only protein coding genes
WARNING: ICGC is using either gene name and ensembl gene id (compact or with the id version) that complicates the analysis
Gene information is recovered from ensembl
# r
library(dplyr)
library(biomaRt)
# Recover protein coding gene annotation and informations from Biomart
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
att <- listAttributes(mart)
hsapiens.coding.genes <- getBM(attributes=c("ensembl_gene_id", "hgnc_symbol", "entrezgene_id"), filters='biotype', values=c('protein_coding'), mart=mart) # 23,222 genes
hsapiens.coding.genes <- hsapiens.coding.genes %>% dplyr::select(gene.id = ensembl_gene_id, gene.symbol = hgnc_symbol, ncbi.gene.id = entrezgene_id)
# Add both gene id and name in the same column
GOI.df <- c(hsapiens.coding.genes$gene.id, hsapiens.coding.genes$gene.symbol)
# Save results
saveRDS(hsapiens.coding.genes, "./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")
write.table(as.data.frame(GOI.df), "./003_Transcriptomics_cluster_analysis/GOI.txt", col.names = F, row.names = F, quote = F)
We consider all available donors with enough mutation to compute snvs density ratio
# Load snvs density ratio df
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
write.table(as.data.frame(snvs.dens.ratio.df$donor), "./003_Transcriptomics_cluster_analysis/donor.txt", col.names = F, row.names = F, quote = F) # 1,826 donors
The expression data for all donors is downloaded (Data repositories > select all donors > Sequencing-based Gene Expression 3.62 GB of data) from the ICGC to generate /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.tsv.gz
The downloaded files are formatted and filtered for genes of interest:
WARNING #1: Genes may be referred to using either their ensembl gene id or symbol WARNING #2: ensembl gene id can reports versions or not
# bash
# Recover useful columns
zcat /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.tsv.gz | awk '{print $1,$8, $10}' | gzip -c \
> /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.tsv.gz
# Remove gene.id version
zcat /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.tsv.gz | awk '{sub(/\..*$/, "", $2)} 1' | gzip -c \
> /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.mod.tsv.gz
# Select GOI
zcat /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.mod.tsv.gz |
grep -wFf /cephfs2/pmurat/OriCan/003_Transcriptomics_cluster_analysis/GOI.txt | gzip -c \
> /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.GOI.tsv.gz
# Delete temporary files
rm -f /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.tsv.gz
rm -f /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.light.mod.tsv.gz
# Count entries
wc -l /cephfs2/pmurat/OriCan/Dataset/ICGC/transcriptome/exp_seq_all_donors.GOI.tsv.gz # 3,374,583 entries
We then consider cancer types independently.
projects.info.df <- read.csv("./Dataset/ICGC/projects_2023_10_30_04_53_06.tsv", sep = "\t")
Cancer projets with transcriptomics data:
MALY-DE: Malignant Lymphoma, Germinal center B-cell derived lymphomas PACA-(AU, CA): Pancreatic cancer, Ductal adenocarcinoma BRCA-US: Breast cancer, Ductal & lobular PRAD-(CA, US, UK, CN, FR): Prostate cancer, Adenocarcinoma LIRI-JP: Liver cancer, Hepatocellular carcinoma (Virus associated) THCA-US, HNSC-US and ORCA-IN: Head and Neck Squamous Cell Carcinoma, Head and Neck Thyroid Carcinoma, Oral Cancer OV-(US, AU): Ovary cancer, Ovarian Serous Cystadenocarcinoma KIRC-US, KIRP-US, RECA-EU: Kidney cancer, Kidney Renal Clear Cell Carcinoma, Kidney Renal Papillary Cell Carcinoma , Renal Cell Cancer
Sequencing-based Gene Expression data report both the normalized_read_count and raw_read_count. The definition of normalized_read_count is unclear (see https://bioinformatics.stackexchange.com/questions/3247/what-is-the-icgc-normalized-read-count for a discussion). We then consider the raw counts (column 10) and apply the appropriate corrections.
For each donor, we add information about mutation density ratio (origin/±10kb flank) values for subsequent analysis.
# r
# SLURM
library(dplyr)
library(tidyr)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
setwd("/cephfs2/pmurat/OriCan")
# Load snvs density ratio df
snvs.dens.ratio.df <- readRDS("./001_Mut_density_Pan_cancer/rds/snvs.dens.ratio.df.rds")
snvs.dens.ratio.light.df <- snvs.dens.ratio.df %>% dplyr::select(donor, snvs.dens.ratio)
# Load gene information
hsapiens.coding.genes <- readRDS("./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")
# Load transcriptomics data
ICGC.transc.df <- read.table(gzfile("./Dataset/ICGC/transcriptome/exp_seq_all_donors.GOI.tsv.gz"), sep = " ")
colnames(ICGC.transc.df) <- c("donor", "gene", "raw.count")
# Convert gene.id into gene.symbol
# Prepare lookup table
gene.lookup.df <- cbind.data.frame(gene.id = unique(ICGC.transc.df$gene)) %>% left_join((hsapiens.coding.genes %>% dplyr::select(gene.id, gene.symbol)), by = "gene.id") %>%
mutate(gene = case_when(gene.symbol != "NA" ~ gene.symbol, T ~ gene.id)) %>% filter(gene != "") %>% dplyr::select(gene.id, gene)
# Swap column names
colnames(gene.lookup.df) <- c("gene", "gene.id")
# Join
ICGC.transc.df <- ICGC.transc.df %>% left_join(gene.lookup.df, by = "gene")
# Add snvs density ratio information and drop donors without snvs details
ICGC.transc.df <- ICGC.transc.df %>% left_join(snvs.dens.ratio.light.df, by = "donor") %>% dplyr::select(-gene) %>% drop_na() # 813 donors
saveRDS(ICGC.transc.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.df.rds")
# Add gene length information
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
hg19.exons <- exonsBy(txdb, "gene")
hg19.exon.length <- width(hg19.exons)
hg19.gene.length.df <- sum(hg19.exon.length) %>% tibble::enframe(name = "gene", value = "exon.length")
# map the Entrez ID to gene symbol
gene.conversion.df <- AnnotationDbi::select(org.Hs.eg.db, keys=hg19.gene.length.df$gene, columns="SYMBOL", keytype="ENTREZID")
all.equal(hg19.gene.length.df$gene, gene.conversion.df$ENTREZID) # entries are unique
hg19.gene.length.df$gene.id <- gene.conversion.df$SYMBOL
hg19.gene.length.df <- hg19.gene.length.df %>% dplyr::select(-gene)
# Join
ICGC.transc.TPM.df <- ICGC.transc.df %>% left_join(hg19.gene.length.df, by = "gene.id")
# Compute gene expression in TPM
ICGC.transc.TPM.df <- ICGC.transc.TPM.df %>% group_by(donor, gene.id) %>%
summarise(raw.count = mean(raw.count, na.rm = T), snvs.dens.ratio = mean(snvs.dens.ratio, na.rm = T), exon.length = mean(exon.length, na.rm = T)) %>% mutate(reads.per.rpk = raw.count/exon.length, TPM = (reads.per.rpk*1000000)/sum(reads.per.rpk, na.rm = T)) %>% dplyr::select(-exon.length, -reads.per.rpk) %>% drop_na()
saveRDS(ICGC.transc.TPM.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
# Reload table
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
We focus on a first time on pancreatic cancers (PACA-(AU, CA): Pancreatic cancer, Ductal adenocarcinoma) as they displays a wide distribution of mutation density ratio (origin/±10kb flank) values.
To identify mutagenic pathway associated with mutagenesis at origins, we cluster PACA tumors based on their transcriptional profile and characterise dysreguated pathways.
A Uniform Manifold Approximation and Projection (UMAP) approach is used to assess tumor heterogeneity in term of transcription.
# r
# SLURM and local
library(dplyr)
library(tidyr)
library(umap)
setwd("/cephfs2/pmurat/OriCan")
# Load gene expression data for all donors and cancer types
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
# Spread df
ICGC.transc.TPM.spread.df <- ICGC.transc.TPM.df %>% dplyr::select(donor, gene.id, TPM) %>% distinct()
ICGC.transc.TPM.spread.df <- ICGC.transc.TPM.spread.df %>% pivot_wider(names_from = gene.id, values_from = TPM, values_fn = first)
ICGC.transc.TPM.spread.df <- as.data.frame(ICGC.transc.TPM.spread.df)
row.names(ICGC.transc.TPM.spread.df) <- ICGC.transc.TPM.spread.df$donor
ICGC.transc.TPM.spread.df <- ICGC.transc.TPM.spread.df[,-1]
ICGC.transc.TPM.spread.df[is.na(ICGC.transc.TPM.spread.df)] <- 0
saveRDS(ICGC.transc.TPM.spread.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.spread.df.rds")
dim(ICGC.transc.TPM.spread.df) # 813 donors, 19298 genes
# Reload data and perform UMAP
# The random_state argument of the umap function is to define the original random state of the analysis (equivalent to set.seed)
# from the cluster
ICGC.transc.TPM.spread.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.spread.df.rds")
ICGC.transc.TPM.umap.df <- umap(ICGC.transc.TPM.spread.df, n_components = 2, random_state = 50)
saveRDS(ICGC.transc.TPM.umap.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.df.rds")
# Reopen locally
ICGC.transc.TPM.umap.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.df.rds")
# Format UMAP
umap.layout <- ICGC.transc.TPM.umap.df[["layout"]]
umap.layout <- data.frame(umap.layout)
ICGC.transc.TPM.umap.final.df <- cbind(umap.layout, donor = rownames(ICGC.transc.TPM.spread.df))
# Add primary site information
ICGC.transc.TPM.umap.final.df <- ICGC.transc.TPM.umap.final.df %>% left_join(donors.color.df, by = "donor") %>% distinct()
saveRDS(ICGC.transc.TPM.umap.final.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.final.df.rds")
# We observe two independent clusters for blood and pancreatic cancer
# Blood clusters are made from MALY and CCLE - not considered for now
# Rerun UMAP analysis on pancreatic cancer
donor.info.df <- read.csv("./Dataset/ICGC/donor.all_projects.tsv", sep = "\t")
# We focus on two projects: PACA-AU and PACA-CA: Pancreatic cancer, Ductal adenocarcinoma.
# Pancreas (PACA)
PACA.cluster.id <- (donor.info.df %>% filter(project_code == "PACA-AU" | project_code == "PACA-CA"))$icgc_donor_id # 778 donors
ICGC.transc.TPM.spread.PACA.df <- ICGC.transc.TPM.spread.df[which(rownames(ICGC.transc.TPM.spread.df) %in% PACA.cluster.id),] # 194 donors
ICGC.transc.TPM.PACA.umap.df <- umap(ICGC.transc.TPM.spread.PACA.df, n_components = 2, random_state = 20)
umap.PACA.layout <- ICGC.transc.TPM.PACA.umap.df[["layout"]]
umap.PACA.layout <- data.frame(umap.PACA.layout)
ICGC.transc.TPM.umap.PACA.final.df <- cbind(umap.PACA.layout, donor = rownames(ICGC.transc.TPM.spread.PACA.df))
saveRDS(ICGC.transc.TPM.umap.PACA.final.df, file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.PACA.final.df.rds")
# Select PACA clusters
PACA.cluster.1.df <- ICGC.transc.TPM.umap.PACA.final.df %>% filter(X1 >= 1 & X2 >= 0) %>% mutate(cluster = "clust.1") # 34 donors
PACA.cluster.2.df <- ICGC.transc.TPM.umap.PACA.final.df %>% filter(X1 <= 1 & X2 >= 0) %>% mutate(cluster = "clust.2") # 33 donors
PACA.cluster.3.df <- ICGC.transc.TPM.umap.PACA.final.df %>% filter(X2 <= 0) %>% mutate(cluster = "clust.3") # 127 donors
PACA.clusters.summary.df <- rbind(PACA.cluster.1.df, PACA.cluster.2.df, PACA.cluster.3.df) %>% dplyr::select(donor, cluster) %>%
left_join(snvs.dens.ratio.df, by = "donor") %>% mutate(cancer.project = paste(cancer.type, cancer.project, sep = "-"))
saveRDS(PACA.clusters.summary.df, file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")
Plot UMAP and PACA cluster characterisation
library(dplyr)
library(ggplot2)
library(scales)
setwd("/Volumes/cephfs2/pmurat/OriCan")
# Load data
ICGC.transc.TPM.umap.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.final.df.rds")
ICGC.transc.TPM.umap.PACA.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.PACA.final.df.rds")
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")
# For all cancer types
ICGC.transc.TPM.umap.final.plot <- ICGC.transc.TPM.umap.final.df %>% ggplot(aes(x = X1, y = X2)) +
geom_point(aes(color = color), shape = 16, alpha = 0.6) +
scale_color_identity(name = "Sites", breaks = ICGC.transc.TPM.umap.final.df$color, labels = ICGC.transc.TPM.umap.final.df$Primary.Site, guide = "legend") +
xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("UMAP") +
theme_bw() + theme(aspect.ratio=1)
ICGC.transc.TPM.umap.final.plot
# For PACA project (colored by clusters)
ICGC.transc.TPM.umap.PACA.final.plot <- ICGC.transc.TPM.umap.PACA.final.df %>%
mutate(cluster = case_when(X1 >= 1 & X2 >= 0 ~ "cluster 1",
X1 <= 1 & X2 >= 0 ~ "cluster 2",
X2 <= 0 ~ "cluster 3")) %>%
ggplot(aes(x = X1, y = X2, color = cluster)) +
geom_point(shape = 16) +
scale_color_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
xlab("UMAP 1") + ylab("UMAP 2") + ggtitle("UMAP - PACA cancers") +
theme_bw() + theme(aspect.ratio=1)
ICGC.transc.TPM.umap.PACA.final.plot
pdf("./Rplots/ICGC.transc.TPM.umap.final.plot.pdf", width=7, height=6, useDingbats=FALSE)
ICGC.transc.TPM.umap.final.plot
dev.off()
## quartz_off_screen
## 2
pdf("./Rplots/ICGC.transc.TPM.umap.PACA.final.plot.pdf", width=7, height=6, useDingbats=FALSE)
ICGC.transc.TPM.umap.PACA.final.plot
dev.off()
## quartz_off_screen
## 2
Characterise mutational burden of PACA clusters.
library(dplyr)
library(ggplot2)
library(scales)
setwd("/Volumes/cephfs2/pmurat/OriCan")
# Load data
ICGC.transc.TPM.umap.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.final.df.rds")
ICGC.transc.TPM.umap.PACA.final.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.umap.PACA.final.df.rds")
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")
# snvs density ratio
PACA.clusters.snvs.plot <- PACA.clusters.summary.df %>% ggplot(aes(x=cluster, y=snvs.dens.ratio)) +
geom_boxplot(aes(fill = cluster), alpha = 0.5) + geom_jitter(alpha = 0.2) +
scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) + ylab("Mutation density ratio (ori/flanks)") +
ggtitle("Pval = 1.643e-06 & 9.303e-09") + ylim(0,6.5) +
theme_bw() + theme(aspect.ratio = 2, axis.title.x=element_blank(), legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))
PACA.clusters.snvs.plot
pdf("./Rplots/PACA.clusters.snvs.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.clusters.snvs.plot
dev.off()
## quartz_off_screen
## 2
# Total number of snvs
PACA.clusters.snvs.total.plot <- PACA.clusters.summary.df %>% ggplot(aes(x=cluster, y=snvs.count)) +
geom_boxplot(aes(fill = cluster), alpha = 0.5) + geom_jitter(alpha = 0.2) +
scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) + ylab("Genome-wide snvs count") +
scale_y_continuous(trans = log10_trans(),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x)), limits = c(2000,200000)) +
ggtitle("Pval = 0.3795 & 0.1115") +
theme_bw() + theme(aspect.ratio = 2, axis.title.x=element_blank(), legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))
PACA.clusters.snvs.total.plot
pdf("./Rplots/PACA.clusters.snvs.total.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.clusters.snvs.total.plot
dev.off()
## quartz_off_screen
## 2
# Compute stats
ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"),]$snvs.dens.ratio,
PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.dens.ratio) # p-value = 1.643e-06
##
## Two-sample Kolmogorov-Smirnov test
##
## data: PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"), ]$snvs.dens.ratio and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.dens.ratio
## D = 0.37344, p-value = 1.643e-06
## alternative hypothesis: two-sided
ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.dens.ratio,
PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"),]$snvs.dens.ratio) # p-value = 9.303e-09
##
## Two-sample Kolmogorov-Smirnov test
##
## data: PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.dens.ratio and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"), ]$snvs.dens.ratio
## D = 0.34884, p-value = 9.303e-09
## alternative hypothesis: two-sided
ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"),]$snvs.count,
PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.count) # p-value = 0.3795
##
## Two-sample Kolmogorov-Smirnov test
##
## data: PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.1"), ]$snvs.count and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.count
## D = 0.12834, p-value = 0.3795
## alternative hypothesis: two-sided
ks.test(PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"),]$snvs.count,
PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"),]$snvs.count) # p-value = 0.1115
##
## Two-sample Kolmogorov-Smirnov test
##
## data: PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.2"), ]$snvs.count and PACA.clusters.summary.df[which(PACA.clusters.summary.df$cluster == "clust.3"), ]$snvs.count
## D = 0.13531, p-value = 0.1115
## alternative hypothesis: two-sided
In this section, we compute the mutational signatures at origins (as previously done) for clustered PACA tumours.
# r
library(dplyr)
library(MutationalPatterns)
setwd("/Volumes/cephfs2/pmurat/OriCan")
# Load cluster information
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")
# Recover donor id
PACA.cluster.1.id <- PACA.cluster.1.df$donor # 34 genomes
PACA.cluster.2.id <- PACA.cluster.2.df$donor # 33 genomes
PACA.cluster.3.id <- PACA.cluster.3.df$donor # 127 genomes
# Load mutational differential matrix
snvs.diff.mut.mat <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.diff.mut.mat.rds")
# Subset donors
PACA.cluster.1.id <- PACA.cluster.1.id[PACA.cluster.1.id %in% colnames(as.data.frame(snvs.diff.mut.mat))] # 5
PACA.cluster.2.id <- PACA.cluster.2.id[PACA.cluster.2.id %in% colnames(as.data.frame(snvs.diff.mut.mat))] # 2
PACA.cluster.3.id <- PACA.cluster.3.id[PACA.cluster.3.id %in% colnames(as.data.frame(snvs.diff.mut.mat))] # 11
# Recover cluster specific profiles
PACA.cluster.1.mat <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(PACA.cluster.1.id)) %>% mutate(PACA.1 = rowSums(.)) %>% dplyr::select(PACA.1)
PACA.cluster.2.mat <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(PACA.cluster.2.id)) %>% mutate(PACA.2 = rowSums(.)) %>% dplyr::select(PACA.2)
PACA.cluster.3.mat <- as.data.frame(snvs.diff.mut.mat) %>% dplyr::select(all_of(PACA.cluster.3.id)) %>% mutate(PACA.3 = rowSums(.)) %>% dplyr::select(PACA.3)
# Combine information
PACA.cluster.mat <- cbind.data.frame(PACA.cluster.1.mat, PACA.cluster.2.mat, PACA.cluster.3.mat)
PACA.cluster.mat <- as.matrix(PACA.cluster.mat)
saveRDS(PACA.cluster.mat, "./003_Transcriptomics_cluster_analysis/rds/PACA.cluster.mat.rds")
# Plot profiles
plot_96_profile(PACA.cluster.mat, condensed = TRUE, ymax = 0.4)
Plot
# r
library(dplyr)
library(MutationalPatterns)
setwd("/Volumes/cephfs2/pmurat/OriCan")
PACA.cluster.mat <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.cluster.mat.rds")
# Plot profiles
plot_96_profile(PACA.cluster.mat, condensed = TRUE, ymax = 0.4)
pdf("./Rplots/PACA.cluster.mut.sig.profile.pdf", width=5, height=4, useDingbats=FALSE)
plot_96_profile(PACA.cluster.mat, condensed = TRUE, ymax = 0.4)
dev.off()
## quartz_off_screen
## 2
# Compute cosine similarity with cluster 2 SBS
# Load cluster mutation matrix
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")
# Compute cosine matrix
snvs.ori.PACA.sim.mat <- cos_sim_matrix(PACA.cluster.mat, snvs.ori.mut.cluster)
snvs.ori.PACA.sim.mat
## cluster.1 cluster.2 cluster.3 cluster.4 cluster.5
## PACA.1 0.1286583 0.9731505 0.2427472 0.1377507 0.3576658
## PACA.2 0.1042018 0.9059249 0.3786150 0.1214611 0.3819263
## PACA.3 0.3094851 0.6913128 0.4125451 0.3720286 0.8004612
PACA signatures have high cosine similarity with the previously identifed cluster 2 SBS.
In this section, we compute the local exposure of cluster 2 SBS at origins (as previously done) for clustered PACA tumours.
Mutations from genomes of each cluster are combined and spatially resolved in bins of 100 nt around origins. The extracted signatures are then fitted to the mutation count matrices in order to characterise their spatial contribution.
Combine vcf files
# PACA 1
# Recover donor id, file names and paths
PACA.1.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% PACA.cluster.1.id))$file.name # 5 genomes
PACA.1.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", PACA.1.vcf.name, sep = "")
# Combine vcf files
PACA.1.snvs.df <- tibble()
for (i in 1:length(PACA.1.vcf.path)) {
print(i/length(PACA.1.vcf.path))
vcf.i <- read.table(gzfile(PACA.1.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
PACA.1.snvs.df <- rbind(PACA.1.snvs.df, vcf.i)
}
write.table(PACA.1.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
# PACA 2
# Recover donor id, file names and paths
PACA.2.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% PACA.cluster.2.id))$file.name # 2 genomes
PACA.2.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", PACA.2.vcf.name, sep = "")
# Combine vcf files
PACA.2.snvs.df <- tibble()
for (i in 1:length(PACA.2.vcf.path)) {
print(i/length(PACA.2.vcf.path))
vcf.i <- read.table(gzfile(PACA.2.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
PACA.2.snvs.df <- rbind(PACA.2.snvs.df, vcf.i)
}
write.table(PACA.2.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
# PACA 3
# Recover donor id, file names and paths
PACA.3.vcf.name <- (snvs.manifest.df %>% filter(donor.id %in% PACA.cluster.3.id))$file.name # 11 genomes
PACA.3.vcf.path <- paste("./Dataset/ICGC/SSM/VCF/", PACA.3.vcf.name, sep = "")
# Combine vcf files
PACA.3.snvs.df <- tibble()
for (i in 1:length(PACA.3.vcf.path)) {
print(i/length(PACA.3.vcf.path))
vcf.i <- read.table(gzfile(PACA.3.vcf.path[i]), comment.char = "#") %>% dplyr::select(V1, V2, V4, V5) %>% mutate(end = as.integer(V2 + 1)) %>% dplyr::select(seqnames = V1, start = V2, end, REF = V4, ALT = V5)
PACA.3.snvs.df <- rbind(PACA.3.snvs.df, vcf.i)
}
write.table(PACA.3.snvs.df, "./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.bed", sep="\t", col.names = F, row.names = F, quote = F)
Compute snvs-origin distances and filter mutations at less than 10kb of an origin
# bash/SLURM
# PACA 1
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.closest.bed
# PACA 2
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.closest.bed
# PACA 3
sort -k1,1 -k2,2n /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.sorted.bed
bedtools closest -a /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.sorted.bed \
-b /cephfs2/pmurat/OriCan/Dataset/ori/ori.inter.hg19.NCBI.bed -D ref \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.closest.bed
awk '$12>=-10000 && $12<=10000' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.closest.bed \
> /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.10kb.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.sorted.bed
rm -f /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.closest.bed
Split bed files according to origin distances
# r
# PACA 1
PACA.1.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.1.snvs.10kb.bed", sep = "\t")
PACA.1.snvs.10kb.df <- PACA.1.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(PACA.1.snvs.10kb.df$dist)) {
PACA.1.vcf <- PACA.1.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(PACA.1.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1/Bin_",i,".bed")
write.table(PACA.1.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
# PACA 2
PACA.2.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.2.snvs.10kb.bed", sep = "\t")
PACA.2.snvs.10kb.df <- PACA.2.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(PACA.2.snvs.10kb.df$dist)) {
PACA.2.vcf <- PACA.2.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(PACA.2.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2/Bin_",i,".bed")
write.table(PACA.2.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
# PACA 3
PACA.3.snvs.10kb.df <- read.table("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA.3.snvs.10kb.bed", sep = "\t")
PACA.3.snvs.10kb.df <- PACA.3.snvs.10kb.df %>% mutate(dist = (as.numeric(cut(-V12, breaks = seq(-10000, 10000, 100)))-100)*100) %>% drop_na()
for (i in unique(PACA.3.snvs.10kb.df$dist)) {
PACA.3.vcf <- PACA.3.snvs.10kb.df %>% filter(dist == i) %>%
mutate(ID = paste("rs", V2, sep = ""), QUAL = ".", FILTER = ".", INFO = ".", FORMAT = ".") %>%
dplyr::select(V1, V2, ID, V4, V5, QUAL, FILTER, INFO, FORMAT)
colnames(PACA.3.vcf) <- c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')
vcffile <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3/Bin_",i,".bed")
write.table(PACA.3.vcf, vcffile, sep="\t", col.names = T, row.names = F, quote = F)
}
Convert to vcf format
# bash/SLURM
# PACA 1
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_1/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_1
# PACA 2
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_2/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_2
# PACA 3
for filename in /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3/*.bed
do
path=${filename%.*}
file=${path##*/}
echo $file
awk 'BEGIN{print"##fileformat=VCFv4.1"}1' /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3/$file.bed > /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_3/$file.vcf
done
rm -rf /cephfs2/pmurat/OriCan/002_Mut_signatures_Pan_cancer/sig/cluster/BED/PACA_3
Extract mutational count matrices
# r
library(dplyr)
library(MutationalPatterns)
setwd("/Volumes/cephfs2/pmurat/OriCan")
# Load genome
ref.genome <- BSgenome.Hsapiens.UCSC.hg19
# PACA 1
vcfFiles.PACA.1 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_1/"), value = TRUE)
PACA.1.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_1/", vcfFiles.PACA.1)
PACA.1.sample.names <- vcfFiles.PACA.1 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
PACA.1.snvs.grl <- read_vcfs_as_granges(PACA.1.files, PACA.1.sample.names, ref.genome)
saveRDS(PACA.1.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.1.snvs.grl.rds")
# PACA 2
vcfFiles.PACA.2 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_2/"), value = TRUE)
PACA.2.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_2/", vcfFiles.PACA.2)
PACA.2.sample.names <- vcfFiles.PACA.2 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
PACA.2.snvs.grl <- read_vcfs_as_granges(PACA.2.files, PACA.2.sample.names, ref.genome)
saveRDS(PACA.2.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.2.snvs.grl.rds")
# PACA 3
vcfFiles.PACA.3 <- grep("*.vcf$", dir("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_3/"), value = TRUE)
PACA.3.files <- paste0("./002_Mut_signatures_Pan_cancer/sig/cluster/VCF/PACA_3/", vcfFiles.PACA.3)
PACA.3.sample.names <- vcfFiles.PACA.3 %>% str_replace(".vcf", "") %>% str_replace("Bin_", "")
PACA.3.snvs.grl <- read_vcfs_as_granges(PACA.3.files, PACA.3.sample.names, ref.genome)
saveRDS(PACA.3.snvs.grl, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.3.snvs.grl.rds")
# Load grange lists
PACA.1.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/PACA.1.snvs.grl.rds")
PACA.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/PACA.2.snvs.grl.rds")
PACA.3.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/PACA.3.snvs.grl.rds")
# Compute mutation count matrices
PACA.1.mut.mat <- mut_matrix(vcf_list = PACA.1.snvs.grl, ref_genome = ref.genome)
PACA.2.mut.mat <- mut_matrix(vcf_list = PACA.2.snvs.grl, ref_genome = ref.genome)
PACA.3.mut.mat <- mut_matrix(vcf_list = PACA.3.snvs.grl, ref_genome = ref.genome)
# Save
saveRDS(PACA.1.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.1.mut.mat.rds")
saveRDS(PACA.2.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.2.mut.mat.rds")
saveRDS(PACA.3.mut.mat, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.3.mut.mat.rds")
# r
# Load mutation count matrices
cluster.1.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.1.mut.mat.rds")
cluster.2.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.mut.mat.rds")
cluster.3.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.3.mut.mat.rds")
cluster.4.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.4.mut.mat.rds")
cluster.5.mut.mat <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/cluster.5.mut.mat.rds")
# Load identified signatures
snvs.ori.mut.cluster <- readRDS("./002_Mut_signatures_Pan_cancer/rds/snvs.ori.mut.cluster.signatures.rds")
# PACA 1
PACA.1.fit <- fit_to_signatures(PACA.1.mut.mat, snvs.ori.mut.cluster)
PACA.1.fit.df <- as.data.frame(PACA.1.fit$contribution)
PACA.1.fit.df <- as.data.frame(t(PACA.1.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, PACA.1 = cluster.2)
PACA.1.fit.df$dist <- as.numeric(PACA.1.fit.df$dist)
PACA.1.fit.df <- PACA.1.fit.df %>% dplyr::select(dist, value = PACA.1) %>% mutate(cluster = "PACA.1")
# PACA 2
PACA.2.fit <- fit_to_signatures(PACA.2.mut.mat, snvs.ori.mut.cluster)
PACA.2.fit.df <- as.data.frame(PACA.2.fit$contribution)
PACA.2.fit.df <- as.data.frame(t(PACA.2.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, PACA.2 = cluster.2)
PACA.2.fit.df$dist <- as.numeric(PACA.2.fit.df$dist)
PACA.2.fit.df <- PACA.2.fit.df %>% dplyr::select(dist, value = PACA.2) %>% mutate(cluster = "PACA.2")
# PACA 3
PACA.3.fit <- fit_to_signatures(PACA.3.mut.mat, snvs.ori.mut.cluster)
PACA.3.fit.df <- as.data.frame(PACA.3.fit$contribution)
PACA.3.fit.df <- as.data.frame(t(PACA.3.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, PACA.3 = cluster.2)
PACA.3.fit.df$dist <- as.numeric(PACA.3.fit.df$dist)
PACA.3.fit.df <- PACA.3.fit.df %>% dplyr::select(dist, value = PACA.3) %>% mutate(cluster = "PACA.3")
# Combine results and plot
PACA.summary.norm.fit.df <- rbind(PACA.1.norm.fit.df, PACA.2.norm.fit.df, PACA.3.norm.fit.df)
saveRDS(PACA.summary.norm.fit.df, file = "./002_Mut_signatures_Pan_cancer/sig/PACA.summary.norm.fit.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Volumes/cephfs2/pmurat/OriCan")
# Load results
PACA.summary.norm.fit.df <- readRDS(file = "./002_Mut_signatures_Pan_cancer/sig/PACA.summary.norm.fit.df.rds")
# Plot
PACA.summary.norm.fit.plot <- PACA.summary.norm.fit.df %>%
ggplot(aes(x = dist, y = value, color = cluster)) +
geom_line() + xlim(-5000,5000) +
scale_color_manual(values = c("#E4221E", "#F39208", "#504B76")) +
geom_hline(yintercept=0, linetype="dashed") +
xlab("Distance from origins (bp)") + ylab("Cluster 2 signature\nrelative contribution") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
PACA.summary.norm.fit.plot
pdf("./Rplots/PACA.summary.norm.fit.plot.pdf", width=10, height=6, useDingbats=FALSE)
PACA.summary.norm.fit.plot
dev.off()
## quartz_off_screen
## 2
Assess the potential strand asymmetry of cluster 2 SBS
# Load cluster 2 snvs gr list
cluster.2.snvs.grl <- readRDS("./002_Mut_signatures_Pan_cancer/sig/cluster.2.snvs.grl.rds")
# Split pyrimidine and purine mutations
cluster.2.pyr.grl <- cluster.2.snvs.grl
cluster.2.pur.grl <- cluster.2.snvs.grl
for (i in 1:length(cluster.2.snvs.grl)) {
print(i/length(cluster.2.snvs.grl))
gr.i <- cluster.2.snvs.grl[i]
bin.i <- names(gr.i)
unlist.gr.i <- unlist(gr.i)
pyr.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "C" | as.character(unlist.gr.i$REF) == "T"]
pyr.gr.comp.i <- GRangesList(pyr.gr.i, compress=TRUE)
cluster.2.pyr.grl[i] <- pyr.gr.comp.i
pur.gr.i <- unlist.gr.i[as.character(unlist.gr.i$REF) == "G" | as.character(unlist.gr.i$REF) == "A"]
pur.gr.comp.i <- GRangesList(pur.gr.i, compress=TRUE)
cluster.2.pur.grl[i] <- pur.gr.comp.i
}
saveRDS(cluster.2.pyr.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pyr.grl.rds")
saveRDS(cluster.2.pur.grl, file = "./002_Mut_signatures_Pan_cancer/sig/cluster.2.pur.grl.rds")
# Compute mutational matrices
cluster.2.pyr.mat <- mut_matrix(vcf_list = cluster.2.pyr.grl, ref_genome = ref.genome)
cluster.2.pur.mat <- mut_matrix(vcf_list = cluster.2.pur.grl, ref_genome = ref.genome)
# Fit signatures
cluster.2.pyr.fit <- fit_to_signatures(cluster.2.pyr.mat, snvs.ori.mut.cluster)
cluster.2.pur.fit <- fit_to_signatures(cluster.2.pur.mat, snvs.ori.mut.cluster)
# Format
cluster.2.pyr.fit.df <- as.data.frame(cluster.2.pyr.fit$contribution)
cluster.2.pyr.fit.df <- as.data.frame(t(cluster.2.pyr.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, value = cluster.2) %>% mutate(strand = "plus")
cluster.2.pur.fit.df <- as.data.frame(cluster.2.pur.fit$contribution)
cluster.2.pur.fit.df <- as.data.frame(t(cluster.2.pur.fit.df)) %>% add_rownames(var = "dist") %>% dplyr::select(dist, value = cluster.2) %>% mutate(strand = "minus")
cluster.2.strand.fit.df <- rbind(cluster.2.pyr.fit.df, cluster.2.pur.fit.df)
cluster.2.strand.fit.df$dist <- as.numeric(cluster.2.strand.fit.df$dist)
saveRDS(cluster.2.strand.fit.df, file = "./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Volumes/cephfs2/pmurat/OriCan")
cluster.2.strand.fit.df <- readRDS("./002_Mut_signatures_Pan_cancer/rds/cluster.2.strand.fit.df.rds")
cluster.2.strand.fit.plot <- cluster.2.strand.fit.df %>% ggplot(aes(x = dist, y = value, color = strand)) +
geom_line() + xlim(-5000, 5000) +
scale_color_manual(values = c("#514B76", "#D56114")) +
xlab("Distance from origin (bp)") + ylab("Absolute SBS\ncluster 2 contribution") +
theme_bw() + theme(aspect.ratio=1, panel.grid.minor = element_line(linetype = "dotted"))
cluster.2.strand.fit.plot
pdf("./Rplots/cluster.2.strand.fit.plot.pdf", width=7, height=6, useDingbats=FALSE)
cluster.2.strand.fit.plot
dev.off()
## quartz_off_screen
## 2
In order to identify the molecular pathway associated with cluster 2 SBS signature we performed a differential gene expression analysis based on the UMAP clustering.
Differential gene expression analysis is performed with DEseq from raw counts.
# r
library(dplyr)
library(DESeq2)
library(RColorBrewer)
library(gplots)
library(scales)
# Load clustering information
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")
# Group donor by cluster
PACA.cluster.1.donor <- unique((PACA.clusters.summary.df %>% filter(cluster == "clust.1"))$donor) # 34 donors
PACA.cluster.2.donor <- unique((PACA.clusters.summary.df %>% filter(cluster == "clust.2"))$donor) # 33 donors
PACA.cluster.3.donor <- unique((PACA.clusters.summary.df %>% filter(cluster == "clust.3"))$donor) # 127 donors
# Prepare raw count matrix
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
ICGC.transc.PACA.cluster.1.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.cluster.1.donor) %>% mutate(cluster = "PACA.1")
ICGC.transc.PACA.cluster.2.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.cluster.2.donor) %>% mutate(cluster = "PACA.2")
ICGC.transc.PACA.cluster.3.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.cluster.3.donor) %>% mutate(cluster = "PACA.3")
ICGC.transc.PACA.TPM.df <- rbind(ICGC.transc.PACA.cluster.1.TPM.df, ICGC.transc.PACA.cluster.2.TPM.df, ICGC.transc.PACA.cluster.3.TPM.df) # 194 donors
#
ICGC.transc.PACA.spread.df <- ICGC.transc.PACA.TPM.df %>% dplyr::select(donor, gene.id, raw.count) %>% distinct()
ICGC.transc.PACA.spread.df <- ICGC.transc.PACA.spread.df %>% pivot_wider(names_from = donor, values_from = raw.count) %>% as.data.frame()
row.names(ICGC.transc.PACA.spread.df) <- ICGC.transc.PACA.spread.df$gene.id
ICGC.transc.PACA.spread.df <- ICGC.transc.PACA.spread.df[,-1]
cts <- ICGC.transc.PACA.spread.df %>% drop_na()
cts <- cts %>% mutate(across(everything(), round, 1)) %>% mutate(across(everything(), ~as.integer(.)))
# Prepare condition df
coldata <- cbind.data.frame(donor = colnames(ICGC.transc.PACA.spread.df)) %>%
mutate(cluster = case_when(donor %in% PACA.cluster.1.donor ~ "PACA.1",
donor %in% PACA.cluster.2.donor ~ "PACA.2",
donor %in% PACA.cluster.3.donor ~ "PACA.3"))
coldata$cluster <- as.factor(coldata$cluster)
rownames(coldata) <- coldata$donor
# Check wehter the columns of the count matrix and the rows of the column data (information about samples) are in the same order.
all(rownames(coldata) %in% colnames(cts)) # TRUE
# Create DEseq experiement
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ cluster)
dds
# Pre-filtering - Remove genes with less than 30 observations (as PACA 1 and PACA 2 have around 30 tumours)
smallestGroupSize <- 30
keep <- rowSums(counts(dds) >= 30) >= smallestGroupSize
dds <- dds[keep,] # 14,702 genes, 194 samples
# Differential expression analysis
dds <- DESeq(dds)
resultsNames(dds)
# "Intercept" "cluster_PACA.2_vs_PACA.1" "cluster_PACA.3_vs_PACA.1"
# Assess DEG associated with PACA clusters
# PACA1 vs PACA3
res.PACA1.vs.PACA3 <- results(dds, pAdjustMethod = "fdr", contrast = c("cluster", "PACA.1", "PACA.3"))
res.PACA1.vs.PACA3 <- as.data.frame(res.PACA1.vs.PACA3) %>% mutate(log10.pval = -log10(padj))
res.PACA1.vs.PACA3$gene.id <- row.names(res.PACA1.vs.PACA3)
saveRDS(res.PACA1.vs.PACA3, "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.3.clust.DEseq.result.rds")
# PACA1 vs PACA2
res.PACA1.vs.PACA2 <- results(dds, pAdjustMethod = "fdr", contrast = c("cluster", "PACA.1", "PACA.2"))
res.PACA1.vs.PACA2 <- as.data.frame(res.PACA1.vs.PACA2) %>% mutate(log10.pval = -log10(padj))
res.PACA1.vs.PACA2$gene.id <- row.names(res.PACA1.vs.PACA2)
saveRDS(res.PACA1.vs.PACA2, "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.2.clust.DEseq.result.rds")
Visualise result
library(dplyr)
library(ggplot2)
setwd("/Volumes/cephfs2/pmurat/OriCan")
DEseq.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.3.clust.DEseq.result.rds")
# Add rank based on adjusted pvalues
DEseq.df <- DEseq.df %>% mutate(rank = dense_rank(-dplyr::desc(padj))) %>%
mutate(class = case_when(rank <= 2000 & log2FoldChange <= 0 ~ "down",
rank <= 2000 & log2FoldChange >= 0 ~ "up",
T ~ "non")) %>% arrange(-padj)
# Plot
DEseq.plot <- DEseq.df %>%
ggplot(aes(x=baseMean, y=log2FoldChange, colour=class)) +
geom_point() + scale_x_continuous(trans = 'log10') +
scale_color_manual(values = c("#504B76", "#D4D2D2", "#E4221E")) +
geom_hline(yintercept=0, linetype="dashed") +
xlab("Mean expression (baseMean - CPM)") + ylab("log2FC expression (PACA1 vs PACA3)") +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"), legend.position = "none")
DEseq.plot
pdf("./Rplots/PACA.DEseq.plot.pdf", width=7, height=6, useDingbats=FALSE)
DEseq.plot
dev.off()
## quartz_off_screen
## 2
Gene set enrichment analysis from DESeq result fgsea is the R version of the software developed by the Broad institute Gene sets are from the Molecular Signatures Database (MSigDB) and downloaded in a R format from https://bioinf.wehi.edu.au/MSigDB/v7.1/ (v7)
# r
# Load DEseq results
DE.PACA1.vs.PACA3.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.3.clust.DEseq.result.rds")
DE.PACA1.vs.PACA2.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.1.vs.2.clust.DEseq.result.rds")
# Convert gene.id to NCBI gene ID
hsapiens.coding.genes <- readRDS("./003_Transcriptomics_cluster_analysis/rds/hsapiens.coding.genes.rds")
hsapiens.coding.genes.2 <- hsapiens.coding.genes %>% dplyr::select(gene.id = gene.symbol, ncbi.gene.id)
DE.PACA1.vs.PACA3.gsea.df <- DE.PACA1.vs.PACA3.df %>% left_join(hsapiens.coding.genes.2, by = "gene.id")
DE.PACA1.vs.PACA2.gsea.df <- DE.PACA1.vs.PACA2.df %>% left_join(hsapiens.coding.genes.2, by = "gene.id")
# Create ranks based on the DESeq Wald statistic (log2FoldChange/lfcSE)
PACA1.vs.PACA3.gseaDat.ranks <- DE.PACA1.vs.PACA3.gsea.df$stat
names(PACA1.vs.PACA3.gseaDat.ranks) <- DE.PACA1.vs.PACA3.gsea.df$ncbi.gene.id
PACA1.vs.PACA2.gseaDat.ranks <- DE.PACA1.vs.PACA2.gsea.df$stat
names(PACA1.vs.PACA2.gseaDat.ranks) <- DE.PACA1.vs.PACA2.gsea.df$ncbi.gene.id
# Load KEGG pathways
pathwaysKEGG <- readRDS("./Dataset/GSEA/Hs.c2.cp.kegg.v7.1.entrez.rds")
# Perform gsea
PACA1.vs.PACA3.fgseaKEGG <- fgsea(pathwaysKEGG, PACA1.vs.PACA3.gseaDat.ranks, minSize=15, maxSize = 500, nperm=1000)
PACA1.vs.PACA2.fgseaKEGG <- fgsea(pathwaysKEGG, PACA1.vs.PACA2.gseaDat.ranks, minSize=15, maxSize = 500, nperm=1000)
# Merge results
PACA1.vs.PACA3.fgseaKEGG.df <- PACA1.vs.PACA3.fgseaKEGG %>% dplyr::select(pathway, pval.PACA1.vs.PACA3 = pval, NES.PACA1.vs.PACA3 = NES, size, leadingEdge)
PACA1.vs.PACA2.fgseaKEGG.df <- PACA1.vs.PACA2.fgseaKEGG %>% dplyr::select(pathway, pval.PACA1.vs.PACA2 = pval, NES.PACA1.vs.PACA2 = NES)
fgseaKEGG.df <- PACA1.vs.PACA3.fgseaKEGG.df %>% left_join(PACA1.vs.PACA2.fgseaKEGG.df, by = "pathway") %>% mutate(NES = (NES.PACA1.vs.PACA3+NES.PACA1.vs.PACA2)/2)
# Combine pvalues according to the fisher method
fishersMethod <- function(x) pchisq(-2 * sum(log(x)),df=2*length(x),lower=FALSE)
pval <- vector()
for (i in 1:nrow(fgseaKEGG.df)) {
pval.i <- c(fgseaKEGG.df[i,]$pval.PACA1.vs.PACA3, fgseaKEGG.df[i,]$pval.PACA1.vs.PACA2)
pval.fisher.i <- fishersMethod(pval.i)
pval <- append(pval, pval.fisher.i)
}
fgseaKEGG.df$pval = pval
saveRDS(fgseaKEGG.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.fgseaKEGG.df.rds")
Plot
library(dplyr)
library(ggplot2)
setwd("/Volumes/cephfs2/pmurat/OriCan")
fgseaKEGG.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.fgseaKEGG.df.rds")
# Ranked pathways
gseaDat.ranks.df <- cbind.data.frame(pathway = fgseaKEGG.df$pathway, NES = fgseaKEGG.df$NES) %>% mutate(rank = dense_rank(dplyr::desc(-NES)))
# Select pathway to highlight
# DNA repair and replication
DNA.repair.replication.keys <- c("REPAIR", "REPLICATION", "RECOMBINATION", "CELL_CYCLE")
DNA.repair.replication.pathway <- unique(grep(paste(DNA.repair.replication.keys, collapse="|"), gseaDat.ranks.df$pathway, value=TRUE))
# Glucose metabolism
Glucose.metabolism.keys <- c("FRUCTOSE", "GALACTOSE", "ASCORBATE", "PURINE", "PYRIMIDINE", "SUCROSE", "PYRUVATE", "GLUCO", "PENTOSE", "CITRATE", "NUCLEOTIDE")
Glucose.metabolism.pathway <- unique(grep(paste(Glucose.metabolism.keys, collapse="|"), gseaDat.ranks.df$pathway, value=TRUE))
# Glucose and cell proliferation
Cancer.keys <- c("CANCER", "MTOR", "NOTCH", "LEUKEMIA", "P53", "ERBB", "GLIOMA", "MELANOMA", "ADHESION", "APOPTOSIS", "VEGF", "WNT", "MAPK", "CARCINOMA", "JAK", "TGF")
Cancer.pathway <- unique(grep(paste(Cancer.keys, collapse="|"), gseaDat.ranks.df$pathway, value=TRUE))
# Add color codes
gseaDat.ranks.df <- gseaDat.ranks.df %>%
mutate(class = case_when(pathway %in% DNA.repair.replication.pathway ~ "DNA repair and replication",
pathway %in% Glucose.metabolism.pathway ~ "Carbohydrate metabolism",
pathway %in% Cancer.pathway ~ "Cancer and cell proliferation",
T ~ "others"))
gseaDat.ranks.plot <- gseaDat.ranks.df %>% ggplot(aes(x = rank, y = NES, fill = class)) +
geom_bar(stat = "identity") + xlab("Pathway ranks") + ylab("Normalised enrichment score") +
scale_fill_manual(values = c("#504B76", "#F39208", "#E52620", "#D4D2D2")) +
geom_hline(yintercept=0, linetype="dashed") +
theme_bw() + theme(aspect.ratio=0.5)
gseaDat.ranks.plot
pdf("./Rplots/PACA.fgseaKEGG.plot.pdf", width=7, height=6, useDingbats=FALSE)
gseaDat.ranks.plot
dev.off()
## quartz_off_screen
## 2
Prepare GSEA plots for pathway groups
library(ggpubr)
# r
# Recover gene id associated with pathway groups
Cancer.genes <- as.vector(unlist(pathwaysKEGG[names(pathwaysKEGG) %in% Cancer.pathway]))
Glucose.metabolism.genes <- as.vector(unlist(pathwaysKEGG[names(pathwaysKEGG) %in% Glucose.metabolism.pathway]))
DNA.repair.replication.genes <- as.vector(unlist(pathwaysKEGG[names(pathwaysKEGG) %in% DNA.repair.replication.pathway]))
# Combine p-values with the Fisher's method
Cancer.pval <- (PACA1.vs.PACA3.fgseaKEGG %>% filter(pathway %in% Cancer.pathway))$pval
Cancer.combine.pval <- fishersMethod(Cancer.pval)
Glucose.metabolism.pval <- (PACA1.vs.PACA2.fgseaKEGG %>% filter(pathway %in% Glucose.metabolism.pathway))$pval
Glucose.metabolism.combine.pval <- fishersMethod(Glucose.metabolism.pval)
DNA.repair.replication.pval <- (PACA1.vs.PACA2.fgseaKEGG %>% filter(pathway %in% DNA.repair.replication.pathway))$pval
DNA.repair.replication.combine.pval <- fishersMethod(DNA.repair.replication.pval)
# Plot enrichment
cancer.plot <- plotEnrichment(Cancer.genes, PACA1.vs.PACA3.gseaDat.ranks) + labs(title=paste("Cancer and cell proliferation | Pval =", signif(Cancer.combine.pval, 4))) + theme_bw() + theme(aspect.ratio=0.5)
gluc.plot <- plotEnrichment(Glucose.metabolism.genes, PACA1.vs.PACA2.gseaDat.ranks) + labs(title=paste("Glucose metabolism | Pval =", signif(Glucose.metabolism.combine.pval, 4))) + theme_bw() + theme(aspect.ratio=0.5)
repair.plot <- plotEnrichment(DNA.repair.replication.genes, PACA1.vs.PACA2.gseaDat.ranks) + labs(title=paste("DNA repair and replication | Pval =", signif(DNA.repair.replication.combine.pval, 4))) + theme_bw() + theme(aspect.ratio=0.5) # pval = 0.017496635
pdf("./Rplots/PACA.fgseaKEGG.gsea.plot.pdf", width=6, height=10, useDingbats=FALSE)
ggarrange(cancer.plot, gluc.plot, repair.plot, nrow = 3)
dev.off()
GSEA plots
The pathways related to DNA repair and Glucose metabolism are further charaterised by decreasing the level of analysis. Sub-pathways were manually retrieved from https://reactome.org/PathwayBrowser/.
# r
# 14 DNA repair and replication pathways were defined
# Define path
DDR.reactome.files <- list.files("./Dataset/Reactome/DNA_repair/")
DDR.reactome.path <- paste("./Dataset/Reactome/DNA_repair", DDR.reactome.files, sep = "/")
DDR.reactome.path.names <- gsub("\\..*", "", DDR.reactome.files)
DDR.reactome.path.names <- gsub("_", " ", DDR.reactome.path.names)
# Recover gene.id
DDR.reactome.df <- tibble()
for (i in 1:length(DDR.reactome.path)) {
df.i <- read.table(DDR.reactome.path[i], skip = 1) %>% mutate(V3 = DDR.reactome.path.names[i])
DDR.reactome.df <- rbind(DDR.reactome.df, df.i)
}
DDR.reactome.df <- DDR.reactome.df %>% dplyr::select(REact.name = V3, gene.id = V2) %>% left_join(hsapiens.coding.genes.2, by = "gene.id") %>% distinct()
DDR.reactome.df <- DDR.reactome.df %>% dplyr::select(-gene.id)
# Prepare list
DDR.reactome.list.df <- DDR.reactome.df %>% group_by(REact.name) %>% summarise(ncbi.gene.id = list(ncbi.gene.id))
DDR.reactome <- DDR.reactome.list.df$ncbi.gene.id
names(DDR.reactome) <- unique(DDR.reactome.list.df$REact.name)
# Perform gsea
fgseaDDR.reactome.PACA1.vs.PACA2 <- fgsea(DDR.reactome, PACA1.vs.PACA2.gseaDat.ranks, eps = 0.0)
fgseaDDR.reactome.PACA1.vs.PACA3 <- fgsea(DDR.reactome, PACA1.vs.PACA3.gseaDat.ranks, eps = 0.0)
# Combine pvalues
fgseaDDR.reactome.combine.pval <- vector()
for (i in 1:length(unique(fgseaDDR.reactome.PACA1.vs.PACA3$pathway))) {
pathway.i <- unique(fgseaDDR.reactome.PACA1.vs.PACA3$pathway)[i]
pval.1.i <- (fgseaDDR.reactome.PACA1.vs.PACA2 %>% filter(pathway == pathway.i))$pval
pval.2.i <- (fgseaDDR.reactome.PACA1.vs.PACA3 %>% filter(pathway == pathway.i))$pval
pval.fisher.i <- fishersMethod(c(pval.1.i, pval.2.i))
fgseaDDR.reactome.combine.pval <- append(fgseaDDR.reactome.combine.pval, pval.fisher.i)
}
fgseaDDR.reactome.df <- cbind.data.frame(pathway = unique(fgseaDDR.reactome.PACA1.vs.PACA3$pathway), pval = fgseaDDR.reactome.combine.pval) %>% left_join((fgseaDDR.reactome.PACA1.vs.PACA2 %>% dplyr::select(pathway, size, leadingEdge)), by = "pathway")
# add information on number of hits
hits.count <- vector()
for (i in 1:nrow(fgseaDDR.reactome.df)) {
leadingEdge.i <- fgseaDDR.reactome.df[i,]$leadingEdge
hits.count.i <- length(unlist(leadingEdge.i))
hits.count <- c(hits.count, hits.count.i)
}
fgseaDDR.reactome.df$hits.count <- hits.count
saveRDS(fgseaDDR.reactome.df, "./003_Transcriptomics_cluster_analysis/rds/fgseaDDR.reactome.df.rds")
# 11 carbohydrate metabolic pathways were defined
# Define path
Gluc.reactome.files <- list.files("./Dataset/Reactome/Glucose/")
Gluc.reactome.path <- paste("./Dataset/Reactome/Glucose/", Gluc.reactome.files, sep = "/")
Gluc.reactome.path.names <- gsub("\\..*", "", Gluc.reactome.files)
Gluc.reactome.path.names <- gsub("_", " ", Gluc.reactome.path.names)
# Recover gene.id
Gluc.reactome.df <- tibble()
for (i in 1:length(Gluc.reactome.path)) {
df.i <- read.table(Gluc.reactome.path[i], skip = 1) %>% mutate(V3 = Gluc.reactome.path.names[i])
Gluc.reactome.df <- rbind(Gluc.reactome.df, df.i)
}
Gluc.reactome.df <- Gluc.reactome.df %>% dplyr::select(REact.name = V3, gene.id = V2) %>% left_join(hsapiens.coding.genes.2, by = "gene.id") %>% distinct()
Gluc.reactome.df <- Gluc.reactome.df %>% dplyr::select(-gene.id)
# Prepare list
Gluc.reactome.list.df <- Gluc.reactome.df %>% group_by(REact.name) %>% summarise(ncbi.gene.id = list(ncbi.gene.id))
Gluc.reactome <- Gluc.reactome.list.df$ncbi.gene.id
names(Gluc.reactome) <- unique(Gluc.reactome.list.df$REact.name)
# Perform gsea
fgseaGluc.reactome.PACA1.vs.PACA2 <- fgsea(Gluc.reactome, PACA1.vs.PACA2.gseaDat.ranks, eps = 0.0)
fgseaGluc.reactome.PACA1.vs.PACA3 <- fgsea(Gluc.reactome, PACA1.vs.PACA3.gseaDat.ranks, eps = 0.0)
# Combine pvalues
fgseaGluc.reactome.combine.pval <- vector()
for (i in 1:length(unique(fgseaGluc.reactome.PACA1.vs.PACA3$pathway))) {
pathway.i <- unique(fgseaGluc.reactome.PACA1.vs.PACA3$pathway)[i]
pval.1.i <- (fgseaGluc.reactome.PACA1.vs.PACA2 %>% filter(pathway == pathway.i))$pval
pval.2.i <- (fgseaGluc.reactome.PACA1.vs.PACA3 %>% filter(pathway == pathway.i))$pval
pval.fisher.i <- fishersMethod(c(pval.1.i, pval.2.i))
fgseaGluc.reactome.combine.pval <- append(fgseaGluc.reactome.combine.pval, pval.fisher.i)
}
fgseaGluc.reactome.df <- cbind.data.frame(pathway = unique(fgseaGluc.reactome.PACA1.vs.PACA3$pathway), pval = fgseaGluc.reactome.combine.pval) %>% left_join((fgseaGluc.reactome.PACA1.vs.PACA2 %>% dplyr::select(pathway, size, leadingEdge)), by = "pathway")
# add information on number of hits
hits.count <- vector()
for (i in 1:nrow(fgseaGluc.reactome.df)) {
leadingEdge.i <- fgseaGluc.reactome.df[i,]$leadingEdge
hits.count.i <- length(unlist(leadingEdge.i))
hits.count <- c(hits.count, hits.count.i)
}
fgseaGluc.reactome.df$hits.count <- hits.count
saveRDS(fgseaGluc.reactome.df, "./003_Transcriptomics_cluster_analysis/rds/fgseaGluc.reactome.df.rds")
Plot
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Volumes/cephfs2/pmurat/OriCan")
# DDR
fgseaDDR.reactome.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/fgseaDDR.reactome.df.rds")
fgseaDDR.reactome.plot <- fgseaDDR.reactome.df %>%
mutate(hitsPerc=hits.count*100/size, log10.pval = -log10(pval)) %>%
arrange(desc(-log10.pval)) %>%
mutate(PATH = factor(pathway, levels = unique(pathway))) %>%
ggplot(aes(x= log10.pval,
y= PATH,
size=size)) +
geom_point(color = "#4F4A75") + xlim(0,25) + xlab("-log10 Pval") +
theme_bw() + theme(aspect.ratio=3, axis.title.y=element_blank())
fgseaDDR.reactome.plot
# Carbohydrate
fgseaGluc.reactome.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/fgseaGluc.reactome.df.rds")
fgseaGluc.reactome.plot <- fgseaGluc.reactome.df %>%
mutate(hitsPerc=hits.count*100/size, log10.pval = -log10(pval)) %>%
arrange(desc(-log10.pval)) %>%
mutate(PATH = factor(pathway, levels = unique(pathway))) %>%
ggplot(aes(x= log10.pval,
y= PATH,
size=size)) +
geom_point(color = "#4F4A75") + xlim(0,8) + xlab("-log10 Pval") +
theme_bw() + theme(aspect.ratio=3, axis.title.y=element_blank())
fgseaGluc.reactome.plot
pdf("./Rplots/PACA.DDR.Gluc.stat.plot.pdf", width=12, height=6, useDingbats=FALSE)
ggarrange(fgseaDDR.reactome.plot, fgseaGluc.reactome.plot, nrow = 1)
dev.off()
## quartz_off_screen
## 2
The gene set enrichment analysis revealed that both glycolysis and pentose phosphate pathway (PPP) pathways are dysregulated in PACA 1. Although PPP and glycolysis are distinct, they involve three common intermediates, glucose 6-phosphate, glyceraldehyde 3-phosphate, and fructose 6-phosphate, so the two pathways are interconnected. It worth noting that PPP is responsible for the generation of ribose 5-phosphate for nucleotide synthesis. We are, herein, asssessing which pathway is the more affected in order to establish how dysregulation of glucose metabolism may affect mutational burden at origins.
Assess the corelation between gene expression and snvs burden at origins for genes involved in both pathways
# We consider all genes previously associated to carbohydrate metabolism (Glucose.metabolism.genes)
# Recover gene id
Glucose.metabolism.gene.id <- (hsapiens.coding.genes.2 %>% filter(ncbi.gene.id %in% Glucose.metabolism.genes))$gene.id # 469 genes
# Load gene expression data
ICGC.transc.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/ICGC.transc.TPM.df.rds")
# Select donor from PACA clusters
PACA.clusters.summary.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.clusters.summary.df.rds")
PACA.clusters.summary.light.df <- PACA.clusters.summary.df %>% dplyr::select(donor, cluster) %>% distinct()
# Filter expression data
PACA.transc.TPM.df <- ICGC.transc.TPM.df %>% filter(donor %in% PACA.clusters.summary.light.df$donor) %>% left_join(PACA.clusters.summary.light.df, by = "donor")
# Compute Pearson correlations
# Only genes with TPM >= 1 and corelation with more than 10 observations are computed
PACA.cor.TPM.df <- tibble()
for (i in 1:length(Glucose.metabolism.gene.id)) {
GOI.i <- Glucose.metabolism.gene.id[i]
print(i/length(Glucose.metabolism.gene.id))
TPM.df.i <- PACA.transc.TPM.df %>% filter(gene.id == GOI.i & TPM >= 1)
if(nrow(TPM.df.i) <= 10){cor.df.i <- tibble()} else {
cor.i <- cor.test(log10(TPM.df.i$TPM), TPM.df.i$snvs.dens.ratio)
Rho.i <- as.numeric(cor.i$estimate)
pval.i <- as.numeric(cor.i$p.value)
cor.df.i <- cbind.data.frame(GOI = GOI.i, Rho = Rho.i, pval = pval.i)}
PACA.cor.TPM.df <- rbind(PACA.cor.TPM.df, cor.df.i)
}
# Label genes unique to the glycolysis, ppp and nucleotide biosynthesis pathways
glycolysis.genes <- read.table("./Dataset/Reactome/Glucose/Glycolysis.HSA-70326.tsv")$MoleculeName
ppp.genes <- read.table("./Dataset/Reactome/Glucose/Pentose_phosphate_pathway.HSA-71336.tsv")$MoleculeName
nucleotide.genes <- read.table("./Dataset/Reactome/Glucose/Nucleotide_biosynthesis.HSA-8956320.tsv")$MoleculeName
glycolysis.genes <- glycolysis.genes[!glycolysis.genes %in% ppp.genes] # 80 genes
ppp.genes <- ppp.genes[!ppp.genes %in% glycolysis.genes] # 15 genes
PACA.cor.TPM.df <- PACA.cor.TPM.df %>% mutate(pathway = case_when(GOI %in% glycolysis.genes ~ "Glycolysis",
GOI %in% ppp.genes ~ "Pentose phosphate pathway",
GOI %in% nucleotide.genes ~ "Nucleotide biosynthesis",
T ~ "others")) %>% distinct()
# Save
saveRDS(PACA.cor.TPM.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.cor.TPM.df.rds")
# Prepare df for plotting relevant examples
PACA.transc.ATIC.df <- PACA.transc.TPM.df %>% filter(gene.id == "ATIC" & TPM >= 10)
saveRDS(PACA.transc.ATIC.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.ATIC.df.rds")
PACA.transc.TALD01.df <- PACA.transc.TPM.df %>% filter(gene.id == "TALDO1" & TPM >= 10)
saveRDS(PACA.transc.TALD01.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.TALD01.df.rds")
Plot
library(dplyr)
library(ggplot2)
library(ggpubr)
library(forcats)
setwd("/Volumes/cephfs2/pmurat/OriCan")
# Correlation plot
PACA.cor.TPM.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.cor.TPM.df.rds")
PACA.cor.TPM.plot <- PACA.cor.TPM.df %>% arrange(factor(pathway, levels = c("others", "Glycolysis", "Pentose phosphate pathway", "Nucleotide biosynthesis"))) %>%
mutate(pathway = fct_inorder(pathway)) %>%
ggplot(aes(x=Rho, y=-log10(pval), color=pathway)) +
geom_point() + xlim(-0.5,0.5) + xlab("Pearson correlation") + ylab("log10 Pval") +
scale_color_manual(values = c("#D4D2D2", "#504B76", "#F39208", "#E52620")) +
theme_bw() + theme(aspect.ratio=1.0, panel.grid.minor = element_line(linetype = "dotted"))
PACA.cor.TPM.plot
# Selected examples
# ATIC
PACA.transc.ATIC.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.transc.ATIC.df.rds")
PACA.transc.ATIC.plot <- PACA.transc.ATIC.df %>% ggplot(aes(x=TPM, y=snvs.dens.ratio)) +
geom_point(shape = 21, fill = "white") +
geom_smooth(method = lm, se = T, color = "#504B76") +
scale_x_continuous(trans = log10_trans()) +
xlab("Expression (log10 TPM)") + ylab("Mutation density ratio (ori/flanks)") + ggtitle("ATIC, Rho = -0.403, Pval = 6.352e-09") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
PACA.transc.ATIC.plot
# TALD01
PACA.transc.TALD01.df <- readRDS("./003_Transcriptomics_cluster_analysis/rds/PACA.transc.TALD01.df.rds")
PACA.transc.TALD01.plot <- PACA.transc.TALD01.df %>% ggplot(aes(x=TPM, y=snvs.dens.ratio)) +
geom_point(shape = 21, fill = "white") +
geom_smooth(method = lm, se = T, color = "#504B76") +
scale_x_continuous(trans = log10_trans()) +
xlab("Expression (log10 TPM)") + ylab("Mutation density ratio (ori/flanks)") + ggtitle("TALD01, Rho = -0.372, Pval = 9.055e-08") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
PACA.transc.TALD01.plot
pdf("./Rplots/PACA.PPP.plot.pdf", width=12, height=5, useDingbats=FALSE)
ggarrange(PACA.cor.TPM.plot, PACA.transc.ATIC.plot, PACA.transc.TALD01.plot, nrow = 1)
dev.off()
## quartz_off_screen
## 2
Previous results suggest that PACA 1 defines a subset of tumours characterised by general replicative stress (oncogene activation, DNA repair defficiency and altered glucose metabolism). In this section, we aim to characterise further the replication stress signature associated with each PACA cluster. We then assess the expression of known biomarkers of replicative stress in pancreatic cancer (Dreyer et al. Gastroenterology, 2021, 160, 362-377 - https://www.gastrojournal.org/article/S0016-5085(20)35229-X/fulltext).
# Define biomarkers (according to Figure 3E)
PACA.biomarkers <- c("KRAS", "CCND2", "NPM1", "MYC", "CCND1", "MDM2", "STAT5A", "HRAS", "CCNE1", "CCND3", "NRAS", "DEK", "CCNE2", "CDC6", "CDCA5", "AURKA", "MYBL2", "E2F1") # 18 genes
# Recover gene expression data and normalise expression (Z-score)
PACA.transc.biom.TPM.df <- PACA.transc.TPM.df %>% filter(gene.id %in% PACA.biomarkers) %>%
group_by(gene.id) %>% mutate(zscore = (raw.count-mean(raw.count, na.rm = T))/sd(raw.count, na.rm = T)) %>% ungroup()
# Compute mean scores for ordering heatmap
mean.zscore <- PACA.transc.biom.TPM.df %>% group_by(donor) %>% summarise(mean = mean(zscore)) %>% mutate(rank = dense_rank(-dplyr::desc(mean))) %>% arrange(rank) %>% dplyr::select(donor, rank)
# Order df
PACA.transc.biom.order.df <- PACA.transc.biom.TPM.df %>% dplyr::select(donor, gene.id, zscore, cluster) %>% left_join(mean.zscore, by = "donor")
# donor
PACA.transc.biom.order.df$donor <- factor(PACA.transc.biom.order.df$donor, levels=unique((PACA.transc.biom.order.df$donor)[order(PACA.transc.biom.order.df$rank)]))
# Prepare facets for plotting
PACA.transc.biom.order.df$facet <- ifelse(PACA.transc.biom.order.df$cluster %in% c("clust.1", "clust.2", "clust.3"), PACA.transc.biom.order.df$cluster)
# Save
saveRDS(PACA.transc.biom.order.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.biom.order.df.rds")
Plot heatmap
library(dplyr)
library(ggplot2)
setwd("/Volumes/cephfs2/pmurat/OriCan")
PACA.transc.biom.order.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.transc.biom.order.df.rds")
PACA.transc.biom.plot <- PACA.transc.biom.order.df %>% ggplot(aes(donor, gene.id, fill= zscore)) +
geom_tile() +
scale_fill_gradientn(colours = c("#4F4A75", "white", "#E52620"), limits=c(-2, 2), na.value = '#E52620') +
theme_bw() + theme(axis.title.y=element_blank(), axis.text.x=element_text(angle=45,hjust=1)) +
facet_grid(.~facet, space = "free", scales = "free_x")
PACA.transc.biom.plot
pdf("./Rplots/PACA.transc.biom.plot.pdf", width=8, height=3, useDingbats=FALSE)
PACA.transc.biom.plot
dev.off()
## quartz_off_screen
## 2
This analysis suggests: - cluster 1 and 2 are characterised by high replication stress (oncogene overexpression). - cluster 2 show higher expression for cyclins D. Both tumours in cluster 1 and 2 diplay features of squamous carcinoma.
This section aims to assess whether cell proliferation and cell cycle are dysregulated in the different identified PACA clusters.
Cell proliferation is assessed by looking at cyclin D1 expression.
# r
# Recover gene expression
PACA.CCND1.TPM.df <- PACA.transc.TPM.df %>% filter(gene.id == "CCND1")
saveRDS(PACA.CCND1.TPM.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.CCND1.TPM.df.rds")
# Compute stats
# Comparing clusters
ks.test(PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.1"),]$TPM, PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.2"),]$TPM) # p-value = 0.343
ks.test(PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.2"),]$TPM, PACA.CCND1.TPM.df[which(PACA.CCND1.TPM.df$cluster == "clust.3"),]$TPM) # p-value < 2.2e-16
# Correlation between CCND1 expression and mutational burden at origins
cor.test(log10(PACA.CCND1.TPM.df$TPM), PACA.CCND1.TPM.df$snvs.dens.ratio) # 0.4046014, p-value = 4.882e-09
Plot
library(dplyr)
library(ggplot2)
library(ggpubr)
setwd("/Volumes/cephfs2/pmurat/OriCan")
PACA.CCND1.TPM.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.CCND1.TPM.df.rds")
PACA.CCND1.plot <- PACA.CCND1.TPM.df %>%
ggplot(aes(x=cluster, y=TPM, fill=cluster)) +
geom_boxplot(alpha = 0.6) + ggtitle("CCND1, Pval = 0.343, < 2.2e-16") +
scale_y_continuous(trans = 'log10', limits = c(0.05, 2000)) +
scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
theme_bw() + theme(aspect.ratio=2, legend.position = "none", axis.title.x=element_blank(), axis.text.x=element_text(angle=45,hjust=1))
PACA.CCND1.plot
PACA.CCND1.snvs.plot <- PACA.CCND1.TPM.df %>% ggplot(aes(x=TPM, y=snvs.dens.ratio)) +
geom_point(shape = 21, fill = "white") +
geom_smooth(method = lm, se = T, color = "#504B76") +
scale_x_continuous(trans = log10_trans()) +
xlab("Expression (log10 TPM)") + ylab("Mutation density ratio (ori/flanks)") + ggtitle("CCND1, Rho = 0.404, Pval = 4.882e-09") +
geom_hline(yintercept=1, linetype="dashed") +
theme_bw() + theme(aspect.ratio=2, panel.grid.minor = element_line(linetype = "dotted"))
PACA.CCND1.snvs.plot
pdf("./Rplots/PACA.CCND1.plot.pdf", width=8, height=6, useDingbats=FALSE)
ggarrange(PACA.CCND1.plot, PACA.CCND1.snvs.plot, nrow = 1)
dev.off()
## quartz_off_screen
## 2
Cell cycle is estimated by the ratio of cyclin expression: - cyclin E / others: G1/S - cyclin A / others: S/G2 - cyclin B / others : G2/M
#r
# Select cyclin encoding genes
cyclins <- c("CCNA1", "CCNA2", "CCNB1", "CCNB2", "CCND1", "CCND2", "CCND3", "CCNE1", "CCNE2")
# Recover expression data and format df
PACA.transc.TPM.spread.df <- PACA.transc.TPM.df %>% dplyr::select(donor, gene.id, TPM) %>% distinct() %>% pivot_wider(names_from = gene.id, values_from = TPM, values_fn = first)
PACA.cyclin.TPM.df <- PACA.transc.TPM.spread.df %>% dplyr::select(all_of(cyclins))
# Compute expression ratios
PACA.cell.cycle.df <- PACA.cyclin.TPM.df %>% mutate(SUM = CCNA1 + CCNA2 + CCNB1 + CCNB2 + CCNE1 + CCNE2) %>%
mutate(G1_S = (CCNE1 + CCNE2)/SUM, S_G2 = (CCNA1 + CCNA2)/SUM, G2_M = (CCNB1 + CCNB2)/SUM)
# Normalise each cycle
PACA.cell.cycle.df <- PACA.cell.cycle.df %>% dplyr::select(donor, G1_S, S_G2, G2_M) %>%
mutate(cell.sum = G1_S + S_G2 + G2_M,
G1_S = G1_S/cell.sum,
S_G2 = S_G2/cell.sum,
G2_M = G2_M/cell.sum) %>% dplyr::select(-cell.sum)
# Add cluster information
PACA.cell.cycle.df <- PACA.cell.cycle.df %>% left_join((PACA.clusters.summary.df %>% dplyr::select(donor, cluster)), by = "donor")
saveRDS(PACA.cell.cycle.df, "./003_Transcriptomics_cluster_analysis/rds/PACA.cell.cycle.df.rds")
# Compute stats
# G1/S phase
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.1"),]$G1_S, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G1_S, alternative = "greater") # p-value = 0.007093
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G1_S, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.3"),]$G1_S, alternative = "greater") # p-value = 0.2203
# S/G2 phase
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.1"),]$S_G2, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$S_G2, alternative = "less") # p-value = 6.198e-08
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$S_G2, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.3"),]$S_G2, alternative = "less") # p-value = 0.7036
# G2/M phase
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.1"),]$G2_M, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G2_M, alternative = "greater") # p-value = 1.145e-06
ks.test(PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.2"),]$G2_M, PACA.cell.cycle.df[which(PACA.cell.cycle.df$cluster == "clust.3"),]$G2_M, alternative = "greater") # p-value = 0.8931
Plot
library(dplyr)
library(tidyr)
library(ggplot2)
setwd("/Volumes/cephfs2/pmurat/OriCan")
PACA.cell.cycle.df <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.cell.cycle.df.rds")
PACA.cell.cycle.plot <- PACA.cell.cycle.df %>%
ungroup() %>% dplyr::select(G1_S, S_G2, G2_M, cluster) %>%
gather("phase", "value", -cluster) %>% mutate(PHASE = fct_relevel(phase, "G1_S", "S_G2", "G2_M")) %>%
ggplot(aes(x=PHASE, y=value, fill=cluster)) +
geom_boxplot(alpha = 0.6) + ylab("Fraction") + ylim(0, 0.8) +
scale_fill_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
theme_bw() + theme(aspect.ratio=1, axis.title.x=element_blank())
PACA.cell.cycle.plot
pdf("./Rplots/PACA.cell.cycle.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.cell.cycle.plot
dev.off()
## quartz_off_screen
## 2
This analysis shows: - PACA 1 and PACA 2 are characterised by increased cell proliferation. - PACA 1 is characterised by a longer S-phase and shorted G1/G2 phases.
Estimate cancer aggressiveness from survival analysis
# r
library(survminer)
# Recover donor information
donor.info <- read.table("./Dataset/ICGC/donor.all_projects.tsv", sep = "\t", header = T)
donor.info <- donor.info %>% dplyr::select(donor = icgc_donor_id, survival = donor_survival_time)
# Add cluster information
PACA.donor.info <- donor.info %>% left_join((PACA.clusters.summary.df %>% dplyr::select(donor, cluster)), by = "donor") %>% drop_na() %>% distinct()
saveRDS(PACA.donor.info, "./003_Transcriptomics_cluster_analysis/rds/PACA.donor.info.rds")
# Compute stats
ks.test(PACA.donor.info[which(PACA.donor.info$cluster == "clust.1"),]$survival, PACA.donor.info[which(PACA.donor.info$cluster == "clust.3"),]$survival, alternative = "greater") # p-value = 0.006931
ks.test(PACA.donor.info[which(PACA.donor.info$cluster == "clust.1"),]$survival, PACA.donor.info[which(PACA.donor.info$cluster == "clust.2"),]$survival, alternative = "greater") # p-value = 0.01849
Plot
library(dplyr)
library(ggplot2)
setwd("/Volumes/cephfs2/pmurat/OriCan")
PACA.donor.info <- readRDS(file = "./003_Transcriptomics_cluster_analysis/rds/PACA.donor.info.rds")
PACA.donor.survival.plot <- PACA.donor.info %>% ggplot(aes(survival, colour = cluster)) +
stat_ecdf() + scale_y_reverse() +
scale_color_manual(values = c("#E41F1A", "#F39200", "#4F4A75")) +
xlab("Days") + ylab("Percent survival") + xlim(0,3000) + ggtitle("Kaplan-Meier curves | Pval = 0.006931 & 0.01849") +
theme_bw() + theme(aspect.ratio=1)
PACA.donor.survival.plot
pdf("./Rplots/PACA.donor.survival.plot.pdf", width=7, height=6, useDingbats=FALSE)
PACA.donor.survival.plot
dev.off()
## quartz_off_screen
## 2